home *** CD-ROM | disk | FTP | other *** search
- /*
- ** Copyright (C) 1995, Enterprise Integration Technologies Corp.
- ** All Rights Reserved.
- ** Kevin Hughes, kevinh@eit.com
- ** 3/11/94
- */
-
- #include "swish.h"
- #include "merge.h"
-
- /* The main merge functions - it accepts three file names.
- ** This is a bit hairy. It basically acts as a zipper,
- ** zipping up both index files into one.
- */
-
- void readmerge(file1, file2, outfile)
- char *file1;
- char *file2;
- char *outfile;
- {
- int i, j, indexfilenum1, indexfilenum2, result, totalfiles,
- skipwords, skipfiles;
- long limit1, limit2, fileinfo1, fileinfo2, offsetstart;
- char line[MAXSTRLEN];
- struct indexentry *ip1, *ip2, *ip3;
- struct indexentry *buffer1, *buffer2;
- FILE *fp1, *fp2, *fp3;
-
- initindexfilehashlist();
-
- if ((fp1 = fopen(file1, "r")) == NULL) {
- sprintf(errorstr, "Couldn't read the index file \"%s\".",
- file1);
- progerr(errorstr);
- }
- if (!isokindexheader(fp1)) {
- sprintf(errorstr, "\"%s\" has an unknown format.",
- file1);
- progerr(errorstr);
- }
- if ((fp2 = fopen(file2, "r")) == NULL) {
- sprintf(errorstr, "Couldn't read the index file \"%s\".",
- file2);
- progerr(errorstr);
- }
- if (!isokindexheader(fp2)) {
- sprintf(errorstr, "\"%s\" has an unknown format.",
- file2);
- progerr(errorstr);
- }
-
- ip1 = ip2 = ip3 = NULL;
- buffer1 = buffer2 = NULL;
- if (verbose)
- printf("Counting files... ");
- indexfilenum1 = getindexfilenum(fp1);
- indexfilenum2 = getindexfilenum(fp2);
- totalfiles = indexfilenum1 + indexfilenum2;
- if (verbose) {
- printf("%d files.\n", indexfilenum1 + indexfilenum2);
- printf("Reading stopwords...");
- }
- readoffsets(fp1);
- readstopwords(fp1);
- limit1 = offsets[STOPWORDPOS];
- fileinfo1 = offsets[FILELISTPOS];
- readoffsets(fp2);
- readstopwords(fp2);
- limit2 = offsets[STOPWORDPOS];
- fileinfo2 = offsets[FILELISTPOS];
-
- if (verbose)
- printf("\nReading file info...");
- fseek(fp1, fileinfo1, 0);
- for (i = 1; i <= indexfilenum1; i++) {
- fgets(line, MAXSTRLEN, fp1);
- addindexfilelist(i, line, &totalfiles);
- }
- fseek(fp2, fileinfo2, 0);
- for (i = 1; i <= indexfilenum2; i++) {
- fgets(line, MAXSTRLEN, fp2);
- addindexfilelist(i + indexfilenum1, line, &totalfiles);
- }
-
- if ((fp3 = fopen(outfile, "w")) == NULL) {
- sprintf(errorstr,
- "Couldn't write the merged index file \"%s\".",
- outfile);
- progerr(errorstr);
- }
-
- if (verbose)
- printf("\nMerging words... ");
-
- printheader(fp3, outfile, 0, totalfiles);
-
- offsetstart = ftell(fp3);
- for (i = 0; i < MAXCHARS; i++)
- fprintf(fp3, "%016li", offsets[i]);
- fputc('\n', fp3);
-
- readoffsets(fp1);
- readoffsets(fp2);
-
- for (i = 0; i < MAXCHARS; i++)
- offsets[i] = 0;
-
- skipwords = 0;
- while (1) {
- if (buffer1 == NULL) {
- ip1 = (struct indexentry *) readindexline(fp1, limit1);
- if (ip1 == NULL)
- break;
- buffer1 = ip1;
- }
- if (buffer2 == NULL) {
- ip2 = (struct indexentry *) readindexline(fp2, limit2);
- if (ip2 == NULL)
- break;
- addfilenums(ip2, indexfilenum1);
- buffer2 = ip2;
- }
- result = wordcompare(ip1->word, ip2->word);
- if (!result) {
- ip3 = (struct indexentry *) mergeindexentries(ip1, ip2);
- printindexentry(ip3, fp3);
- freeindexentry(ip1);
- freeindexentry(ip2);
- freeindexentry(ip3);
- buffer1 = buffer2 = NULL;
- skipwords++;
- }
- else if (result < 0) {
- printindexentry(ip1, fp3);
- freeindexentry(ip1);
- buffer1 = NULL;
- }
- else {
- printindexentry(ip2, fp3);
- freeindexentry(ip2);
- buffer2 = NULL;
- }
- }
-
- if (verbose) {
- if (skipwords)
- printf("%d redundant word%s.", skipwords,
- (skipwords == 1) ? "" : "s");
- else
- printf("no redundant words.");
- }
-
- printstopwords(fp3);
- fclose(fp3);
-
- if (verbose)
- printf("\nMerging file info... ");
- fp3 = fopen(outfile, "a+");
-
- offsets[FILELISTPOS] = ftell(fp3);
- for (i = j = 1; i <= indexfilenum1 + indexfilenum2; i++)
- if (getmap(i) == j) {
- addtofilehashlist(j++ - 1, ftell(fp3));
- fprintf(fp3, "%s", lookupindexfilenum(i));
- }
-
- skipfiles = (indexfilenum1 + indexfilenum2) - totalfiles;
- if (verbose) {
- if (skipfiles)
- printf("%d redundant file%s.", skipfiles,
- (skipfiles == 1) ? "" : "s");
- else
- printf("no redundant files.");
- }
-
- printfileoffsets(fp3);
-
- fseek(fp3, offsetstart, 0);
- for (i = 0; i < MAXCHARS; i++)
- fprintf(fp3, "%016li", offsets[i]);
- fclose(fp3);
-
- fclose(fp1);
- fclose(fp2);
-
- if (verbose)
- printf("\nDone.\n");
- }
-
- /* Gets the number of files in an index file.
- */
-
- int getindexfilenum(fp)
- FILE *fp;
- {
- int i;
- char line[MAXSTRLEN];
-
- readoffsets(fp);
- fseek(fp, offsets[FILELISTPOS], 0);
-
- i = 0;
- while(ftell(fp) != offsets[FILEOFFSETPOS]) {
- fgets(line, MAXSTRLEN, fp);
- i++;
- }
-
- return i;
- }
-
- /* This adds an offset to the file numbers in a particular
- ** result list. For instance, file 1 has file numbers going from
- ** 1 to 10, but so does file 2, so I have to add 10 to all the
- ** file numbers in file 2 before merging.
- */
-
- void addfilenums(ip, num)
- struct indexentry *ip;
- int num;
- {
- struct result *rp;
-
- rp = ip->result;
- while (rp != NULL) {
- rp->filenum =
- encodefilenum(getmap(decodefilenum(rp->filenum) + num));
- rp = rp->next;
- }
- }
-
- /* This reads the next line in the index file and puts the results
- ** in a result structure.
- */
-
- struct indexentry *readindexline(fp, limit)
- FILE *fp;
- long limit;
- {
- int i, c, x, countnum, rank, filenum, structure;
- char fileword[MAXWORDLEN];
- struct result *rp;
- struct indexentry *ip;
-
- rp = NULL;
-
- if (limit == ftell(fp))
- return NULL;
- for (i = 0; (c = fgetc(fp)) != 0; ) {
- if (c == ':') {
- fileword[i] = '\0';
- break;
- }
- else
- fileword[i++] = c;
- }
-
- countnum = 1;
-
- ungetc(c, fp);
- while ((c = fgetc(fp)) != 0) {
- x = 0;
- do {
- c = fgetc(fp);
- if (c == 0)
- break;
- x *= 128;
- x += c & 127;
- } while (c & 128);
- if (c == 0)
- break;
- if (x) {
- if (countnum == 1) {
- filenum = x;
- countnum++;
- }
- else if (countnum == 2) {
- rank = x;
- countnum++;
- }
- else if (countnum == 3) {
- structure = x;
- rp = (struct result *)
- addtoresultlist(rp, filenum,
- rank, structure);
- countnum = 1;
- }
- }
- }
-
- ip = (struct indexentry *) emalloc(sizeof(struct indexentry));
- ip->word = (char *) mystrdup(fileword);
- ip->result = rp;
-
- return ip;
- }
-
- /* This puts all the file info into a hash table so that it can
- ** be looked up by its pathname and filenumber. This is how
- ** we find redundant file information.
- */
-
- void addindexfilelist(num, info, totalfiles)
- int num;
- char *info;
- int *totalfiles;
- {
- int i;
- static int j;
- unsigned hashval;
- char tmpstr[MAXSTRLEN], path[MAXSTRLEN];
- struct indexfileinfo *ip1, *ip2;
-
- strcpy(path, extractpath(info));
-
- i = lookupindexfilepath(path);
- if (i != -1) {
- *totalfiles = *totalfiles - 1;
- remap(num, i);
- return;
- }
-
- remap(num, j + 1);
- j++;
-
- ip1 = (struct indexfileinfo *) emalloc(sizeof(struct indexfileinfo));
- ip1->filenum = num;
- ip1->fileinfo = (char *) mystrdup(info);
- ip1->path = (char *) mystrdup(path);
-
- sprintf(tmpstr, "%d", num);
- hashval = bighash(tmpstr);
- ip1->next = indexfilehashlist[hashval];
- indexfilehashlist[hashval] = ip1;
-
- ip2 = (struct indexfileinfo *) emalloc(sizeof(struct indexfileinfo));
- ip2->filenum = num;
- ip2->fileinfo = (char *) mystrdup(info);
- ip2->path = (char *) mystrdup(path);
-
- hashval = bighash(path);
- ip2->next = indexfilehashlist[hashval];
- indexfilehashlist[hashval] = ip2;
- }
-
- /* This extracts the pathname information from the file information
- ** line as stored in the index file.
- */
-
- char *extractpath(s)
- char *s;
- {
- int i;
- static char path[MAXSTRLEN];
-
- for (i = 0; s[i] && s[i] != '\"'; i++)
- path[i] = s[i];
- path[i - 1] = '\0';
- path[i] = '\0';
-
- return path;
- }
-
- /* This returns the file information corresponding to a file number.
- */
-
- char *lookupindexfilenum(num)
- int num;
- {
- unsigned hashval;
- char tmpstr[MAXSTRLEN];
- struct indexfileinfo *ip;
-
- sprintf(tmpstr, "%d", num);
- hashval = bighash(tmpstr);
- ip = indexfilehashlist[hashval];
-
- while (ip != NULL) {
- if (ip->filenum == num)
- return ip->fileinfo;
- ip = ip->next;
- }
- return NULL;
- }
-
- /* This returns the file number corresponding to a pathname.
- */
-
- int lookupindexfilepath(path)
- char *path;
- {
- unsigned hashval;
- struct indexfileinfo *ip;
-
- hashval = bighash(path);
- ip = indexfilehashlist[hashval];
-
- while (ip != NULL) {
- if (!strcmp(ip->path, path))
- return ip->filenum;
- ip = ip->next;
- }
- return -1;
- }
-
- /* This simply concatenates two information lists that correspond
- ** to a word found in both index files.
- */
-
- struct indexentry *mergeindexentries(ip1, ip2)
- struct indexentry *ip1;
- struct indexentry *ip2;
- {
- struct result *newrp, *rp1, *rp2;
- struct indexentry *ep;
-
- rp1 = ip1->result;
- rp2 = ip2->result;
- newrp = NULL;
-
- while (rp1 != NULL) {
- newrp = (struct result *) addtoresultlist(newrp,
- rp1->filenum, rp1->rank, rp1->structure);
- rp1 = rp1->next;
- }
- while (rp2 != NULL) {
- newrp = (struct result *) addtoresultlist(newrp,
- rp2->filenum, rp2->rank, rp2->structure);
- rp2 = rp2->next;
- }
-
- ep = (struct indexentry *) emalloc(sizeof(struct indexentry));
- ep->word = (char *) mystrdup(ip1->word);
- ep->result = newrp;
-
- return ep;
- }
-
- /* This prints a new word entry into the merged index file,
- ** removing redundant file information as it goes along.
- */
-
- void printindexentry(ip, fp)
- struct indexentry *ip;
- FILE *fp;
- {
- int i, num;
- struct result *rp;
-
- for (i = 0; indexchars[i] != '\0'; i++)
- if ((ip->word)[0] == indexchars[i] && !offsets[i])
- offsets[i] = ftell(fp);
- fprintf(fp, "%s:", ip->word);
- initmarkentrylist();
- rp = ip->result;
- while (rp != NULL) {
- num = rp->filenum;
- if (!ismarked(num)) {
- marknum(num);
- compress(num, fp);
- compress(rp->rank, fp);
- compress(rp->structure, fp);
- }
- rp = rp->next;
- }
- fputc(0, fp);
- }
-
- /* This associates a number with a new number.
- ** This function is used to remap file numbers from index
- ** files to a new merged index file.
- */
-
- void remap(oldnum, newnum)
- int oldnum;
- int newnum;
- {
- unsigned hashval;
- char tmpstr[MAXSTRLEN];
- struct mapentry *mp;
-
- mp = (struct mapentry *) emalloc(sizeof(struct mapentry));
- mp->oldnum = oldnum;
- mp->newnum = newnum;
-
- sprintf(tmpstr, "%d", oldnum);
- hashval = bighash(tmpstr);
- mp->next = mapentrylist[hashval];
- mapentrylist[hashval] = mp;
- }
-
- /* This retrieves the number associated with another.
- */
-
- int getmap(num)
- int num;
- {
- unsigned hashval;
- char tmpstr[MAXSTRLEN];
- struct mapentry *mp;
-
- sprintf(tmpstr, "%d", num);
- hashval = bighash(tmpstr);
- mp = mapentrylist[hashval];
-
- while (mp != NULL) {
- if (mp->oldnum == num)
- return mp->newnum;
- mp = mp->next;
- }
- return num;
- }
-
- /* This marks a number as having been printed.
- */
-
- void marknum(num)
- int num;
- {
- unsigned hashval;
- char tmpstr[MAXSTRLEN];
- struct markentry *mp;
-
- mp = (struct markentry *) emalloc(sizeof(struct markentry));
- mp->num = num;
-
- sprintf(tmpstr, "%d", num);
- hashval = bighash(tmpstr);
- mp->next = markentrylist[hashval];
- markentrylist[hashval] = mp;
- }
-
- /* Has a number been printed?
- */
-
- int ismarked(num)
- int num;
- {
- unsigned hashval;
- char tmpstr[MAXSTRLEN];
- struct markentry *mp;
-
- sprintf(tmpstr, "%d", num);
- hashval = bighash(tmpstr);
- mp = markentrylist[hashval];
-
- while (mp != NULL) {
- if (mp->num == num)
- return 1;
- mp = mp->next;
- }
- return 0;
- }
-
- /* Initialize the marking list.
- */
-
- void initmarkentrylist()
- {
- int i;
- struct markentry *mp;
-
- for (i = 0; i < BIGHASHSIZE; i++) {
- mp = markentrylist[i];
- if (mp != NULL)
- free(mp);
- markentrylist[i] = NULL;
- }
- }
-
- /* Initialize the main file list.
- */
-
- void initindexfilehashlist()
- {
- int i;
- struct indexfileinfo *ip;
-
- for (i = 0; i < BIGHASHSIZE; i++) {
- ip = indexfilehashlist[i];
- if (ip != NULL)
- free(ip);
- indexfilehashlist[i] = NULL;
- }
- }
-
- /* Frees up used index entries, my best attempt at memory management...
- ** I still have bytes leaking elsewhere...
- */
-
- void freeindexentry(ip)
- struct indexentry *ip;
- {
- struct result *rp, *oldp;
-
- free(ip->word);
- rp = ip->result;
- while (rp != NULL) {
- oldp = rp;
- rp = rp->next;
- free(oldp);
- }
- free(ip);
- }
-
- /* Translates a file number into something that can be compressed.
- */
-
- int encodefilenum(num)
- int num;
- {
- int i, j;
-
- for (i = j = 0; i != num; i++) {
- j++;
- if (!(j % 128))
- j++;
- }
- return j;
- }
-
- /* Translates a compressed file number into a correct file number.
- */
-
- int decodefilenum(num)
- int num;
- {
- int i, extra;
-
- for (i = 1, extra = 0; i < num; i++)
- if (!(i % 128)) {
- extra++;
- i++;
- }
- num -= extra;
-
- return num;
- }
-